#This R script reproduces the topic model (STM) output used to create Table 1/Figure A.2 and Figure A.1

#clear memory 
#note: this only removes objects from the current environment; attached
#packages and options set earlier in the session are left untouched
rm( list=ls() ) 

#set seed
set.seed(9)

#load necessary packages
library(stm)
library(stringr)

################################################################
#######################Read In Data#############################
################################################################

#set working directory
#fill in the folder that holds requests.translated.csv; when left blank the
#current working directory is used (calling setwd("") directly stops the
#script with "cannot change working directory")
data.dir<-""
if(nzchar(data.dir)){
  setwd(data.dir)
}

#read in data and doublechecks rows
main.data<-read.csv("requests.translated.csv",header=TRUE)
nrow(main.data)

#light preprocessing: strip http/https/ftp links from the request text
#NOTE(review): both (.*) groups are greedy, so the match appears to run from
#the start of the first link to the end of the text rather than stopping at
#the end of the link; left unchanged so the replication output stays the
#same - confirm this is intended
main.data$Request<-gsub(" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", main.data$Request)

#status/agency phrases used by the coding rules below; each vector is collapsed
#into a single "a|b|c" pattern, so a text matches if it contains any phrase
security.terms<-c("Police","Surveillance","Security Bureau","Department of Justice","courts and tribunals")
deny.terms<-c("refused","did not have","overdue","postal mail","administrator attention")
unresolved.terms<-c("waiting","has had an unusual response","withdrawn","There was a delivery error or similar")
ambiguous.terms<-c("postal mail","administrator attention")
matches.any<-function(terms,text){
  grepl(paste(terms,collapse="|"),text)
}

#code security Agency measure (1 = agency name contains a security phrase)
main.data$security<-ifelse(matches.any(security.terms,main.data$Agency),1,0)

#code intermediate DVs
#deny/partial/success are 0/1 flags; denyord is ordinal:
#2 = denied, 1 = partially successful, 0 = otherwise
main.data$deny<-ifelse(matches.any(deny.terms,main.data$Status),1,0)
main.data$partial<-ifelse(grepl("partially successful",main.data$Status),1,0)
main.data$success<-ifelse(grepl("The request was successful",main.data$Status),1,0)
main.data$denyord<-ifelse(main.data$deny==1,2,ifelse(main.data$partial==1,1,0))

#fix some intermediate DV cases: statuses matching the unresolved phrases
#(waiting, unusual response, withdrawn, delivery error) are set to missing
unresolved<-matches.any(unresolved.terms,main.data$Status)
main.data$deny[unresolved]<-NA
main.data$denyord[unresolved]<-NA

#code final DVs
#"cons" variants additionally set postal-mail / administrator-attention
#statuses to missing; "large" variants give refusals their own top category (3)
ambiguous<-matches.any(ambiguous.terms,main.data$Status)
main.data$denyordcons<-main.data$denyord
main.data$denyordcons[ambiguous]<-NA
main.data$denyordlarge<-ifelse(grepl("refused",main.data$Status),3,main.data$denyord)
main.data$denyordlargecons<-main.data$denyordlarge
main.data$denyordlargecons[ambiguous]<-NA

#drop the coding helpers so the workspace holds the same objects as before
rm(security.terms,deny.terms,unresolved.terms,ambiguous.terms,matches.any,unresolved,ambiguous)

####################################################################
######################Prepare for STM Analysis######################
####################################################################

#turn the request text into stm's document format: lowercase, drop stopwords,
#numbers and punctuation, stem, and apply sparselevel=.9999; the full data
#frame rides along as document metadata
temp<-textProcessor(
  documents=main.data$Request,
  metadata=main.data,
  lowercase=TRUE,
  removestopwords=TRUE,
  removenumbers=TRUE,
  removepunctuation=TRUE,
  stem=TRUE,
  sparselevel=.9999
)

#run the processed corpus through prepDocuments and keep the resulting
#documents, vocabulary and metadata for the model calls below
out<-prepDocuments(documents=temp$documents,vocab=temp$vocab,meta=temp$meta)
docs<-out$documents
vocab<-out$vocab
meta<-out$meta

########################################################
#################Topic Number Selection#################
########################################################

#compare candidate topic counts (10, 20, ..., 70) with searchK, using an
#intercept-only prevalence formula and spectral initialisation
topics.searchK<-searchK(
  documents=docs,
  vocab=vocab,
  K=seq(10,70,by=10), #candidate numbers of topics
  init.type="Spectral",
  proportion=0.5, #default
  heldout.seed=1234, #optional
  M=10, #default
  cores=1, #default
  prevalence=~1,
  data=meta,
  max.em.its=200,
  verbose=TRUE
)

#diagnostic plots across the candidate topic counts (Figure A1)
plot(topics.searchK)

#re-seed before fitting the final model
#(R reads the literal 02138 as the number 2138)
set.seed(02138)

#fit the 20-topic model with an intercept-only prevalence formula (~1, i.e. no
#covariates enter the prevalence model); manyTopics repeats the fit 25 times
#and keeps a run that is pareto dominant in exclusivity and semantic coherence
#NOTE(review): an earlier version of this comment said 10 runs were used in
#the interest of runtime; the call below uses runs=25
storage.mod.out<-manyTopics(
  documents=docs,
  vocab=vocab,
  K=20,
  prevalence=~1,
  data=meta,
  max.em.its=500,
  runs=25
)

#store output from (one of) the optimal run(s) of the twenty topic model
twenty.mod.out<-storage.mod.out$out[[1]]

#summarize the k=20 model output
summary(twenty.mod.out)

################################################################
#####################Summarize Top Words########################
################################################################

#Note: This produces Table 1 and Figure A.2 inputs

#pull the FREX word matrix once (one row per topic, 20 words per row) instead
#of re-running labelTopics for every topic inside a loop
frex.words<-labelTopics(twenty.mod.out,n=20)$frex

#one-column character matrices, one row per topic:
#topnumbers holds the topic index ("1", "2", ...); to label rows "Topic 1",
#"Topic 2", ... instead, use paste("Topic",seq_len(nrow(frex.words)),sep=" ")
#topwords holds that topic's FREX words joined by ", "
topnumbers<-matrix(as.character(seq_len(nrow(frex.words))),ncol=1)
topwords<-matrix(apply(frex.words,1,paste,collapse=", "),ncol=1)

topforplot<-cbind(topnumbers,topwords)
colnames(topforplot)<-c("Topic Number","Top 20 Words")

#draw the top-words table with base graphics: a three-column grid
#(topic number | top 20 words | hand-assigned label) with one row per topic
#note: the margin setting below stays in effect on the device afterwards
par(mar=c(0.001,0.001,0.001,0.001))
local.ylab<-("")
local.xlab<-("")
local.ylim<-c(.40,.85)
local.xlim<-c(0.07,1)

#open a single empty canvas in the table's coordinate system; every rule and
#label is then added to it, so the device is not left in par(new=TRUE) overlay
#mode (which would make the next plot, or a re-run, draw on top of the table)
plot(x=local.xlim,y=local.ylim,type="n",xlim=local.xlim,ylim=local.ylim,xlab=local.xlab,ylab=local.ylab,axes=FALSE,ann=FALSE)

#vertical rules: left border, the two column separators, right border
rule.x<-c(.04,.08,.88,1.03)
segments(x0=rule.x,y0=.40,x1=rule.x,y1=.85)

#horizontal rules: bottom border, header underline, top border
rule.y<-c(.40,.82,.85)
segments(x0=.04,y0=rule.y,x1=1.03,y1=rule.y)

#column headers
text(.456, .83, expression(paste(bold(Top)," ", bold("20")," ", bold(Words))),cex=1,pos=4)
text(.038, .83, expression(paste(bold(Topic),)),cex=1,pos=4)
text(.912, .83, expression(paste(bold(Labels),)),cex=1,pos=4)

#row baselines: .80, .78, ..., .42 (built from integers so the values equal
#the decimal literals exactly)
row.y<-seq(80,42,by=-2)/100
row.index<-seq_along(row.y)

#hand-assigned labels, listed in topic order (topic 1 first)
topic.labels<-expression(
  "Human Services",
  "Covid-19",
  "Safety & Environment",
  "Detainees",
  "Health Statistics",
  "Social Statistics",
  "Financial Regulation",
  "Protest-Repression",
  "Sexual Deviance",
  "E-Health",
  "Immigration",
  "Legal Decisions",
  "Labor",
  "Population Statistics",
  "Equity",
  "Legal Research",
  "Sex Workers",
  "Refugees",
  "Gov. Oversight",
  "Procurement"
)

#fill the three columns, one text() call per column
text(.082, row.y, topwords[row.index],cex=.78,pos=4)
text(.0475, row.y, topnumbers[row.index],cex=.85,pos=4)
text(.88, row.y, topic.labels,cex=.85,pos=4)